The slides for this webinar are available here.
First, let’s take a look at the data that is available to us to start to answer these research questions.
How do you read a .csv into R?
# read in biographical data table
bio <- read_csv("bio_data_table.csv")
# read in giving data table
giving <- read_csv("giving_data_table.csv")
You can read multiple data files into the same R session. Each of these files contain fictional data created by the generate_data.R script.
Have you ever connected R to a database?
The following is an example of how to create a sample database in R and to load information from that database. More information on using databases from R can be found here. If you use the tidyverse, you can use the same workflow with information from databases and .csv or Excel files.
# create database connection
con <- DBI::dbConnect(RSQLite::SQLite(), path = ":dbname:")
# put some data in our new database
copy_to(dest = con,
df = bio,
name = "bio_table",
temporary = FALSE)
copy_to(dest = con,
df = giving ,
name = "giving_table",
temporary = FALSE)
# print out our table names
db_list_tables(con)
## [1] "bio_table" "giving_table" "sqlite_stat1" "sqlite_stat4"
# let's take a look at the bio table
tbl(con, "bio_table")
## # Source: table<bio_table> [?? x 13]
## # Database: sqlite 3.30.1 []
## id name household_id deceased country city birthday zip state lat
## <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 4.22e6 al-B… 1000042 N United… Taho… -18805 79373 TX 33.2
## 2 2.32e6 Fern… 1000214 Y United… Wins… -18791 27105 NC 36.1
## 3 1.01e6 Brow… 1000214 N United… Tayl… -18791 48180 MI 42.2
## 4 8.07e6 Zhan… 1000294 N United… Rose… -18784 77471 TX 29.5
## 5 4.05e6 Redf… 1000294 N United… Arli… -18783 60004 IL 42.1
## 6 4.45e6 Mart… 1000309 N United… Gatl… -18781 37738 TN 35.7
## 7 3.98e6 Reyn… 1000309 N United… Cart… -18778 64835 MO 37.2
## 8 3.10e6 Cerv… 1000334 N United… Sant… -18777 95060 CA 37.0
## 9 8.42e6 Pull… 1000334 N United… Scot… -18768 85254 AZ 33.6
## 10 5.19e6 Gree… 1000374 Y United… Coni… -18767 80433 CO 39.5
## # … with more rows, and 3 more variables: lon <dbl>, capacity <chr>,
## # capacity_source <chr>
# we can use dplyr syntax to query a database
# dplyr automatically converts our r code to sql
# alternatively you can write sql code directly in rmarkdown as well
tbl(con, "bio_table") %>%
filter(state == "NC") %>%
select(name, city, capacity)
## # Source: lazy query [?? x 3]
## # Database: sqlite 3.30.1 []
## name city capacity
## <chr> <chr> <chr>
## 1 Fernandez, Luisovich Winston salem >$1k
## 2 Pratt, Letyraial Mayodan $75k - $100k
## 3 Dorsett, Joshua Greensboro $2.5k - $5k
## 4 Giehm, Bryce Lexington $10k - $25k
## 5 Tillman, Zakkary Charlotte $500k - $750k
## 6 Merritt Jr, Augneea Emerald isle $10k - $25k
## 7 Smith, Keith Huntersville $10k - $25k
## 8 Lee, Aymber Cary $10k - $25k
## 9 Gonzales, Anna Arden >$1k
## 10 Maggard, Heaven Weldon $50k - $75K
## # … with more rows
How do you take a look at your data in R?
bio %>%
glimpse()
## Observations: 100,000
## Variables: 13
## $ id <dbl> 4218829, 2323958, 1006648, 8066824, 4049269, 4451170,…
## $ name <chr> "al-Bari, Humaidaan", "Fernandez, Luisovich", "Brown,…
## $ household_id <dbl> 1000042, 1000214, 1000214, 1000294, 1000294, 1000309,…
## $ deceased <chr> "N", "Y", "N", "N", "N", "N", "N", "N", "N", "Y", "Y"…
## $ country <chr> "United States", "United States", "United States", "U…
## $ city <chr> "Tahoka", "Winston salem", "Taylor", "Rosenberg", "Ar…
## $ birthday <date> 1918-07-08, 1918-07-22, 1918-07-22, 1918-07-29, 1918…
## $ zip <chr> "79373", "27105", "48180", "77471", "60004", "37738",…
## $ state <chr> "TX", "NC", "MI", "TX", "IL", "TN", "MO", "CA", "AZ",…
## $ lat <dbl> 33.16, 36.10, 42.22, 29.54, 42.09, 35.72, 37.15, 37.0…
## $ lon <dbl> -101.79, -80.24, -83.26, -95.79, -87.98, -83.49, -94.…
## $ capacity <chr> "$10k - $25k", ">$1k", NA, "$500k - $750k", "$25k - $…
## $ capacity_source <chr> "institutional", "screening", "screening", "instituti…
giving %>%
glimpse()
## Observations: 540,000
## Variables: 6
## $ `household ID` <dbl> 1117484, 1117484, 8834030, 8834030, 5163380, 5163380, …
## $ ID <dbl> 1235370, 1449053, 8082210, 2008892, 7190414, 7400710, …
## $ `gift id` <dbl> 1000080, 1000080, 1000127, 1000127, 1000144, 1000144, …
## $ `credit Type` <chr> "Hard-Credit", "Soft-Credit", "Hard-Credit", "Soft-Cre…
## $ `gift amt` <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10…
## $ `gift date` <date> 2015-06-12, 2015-06-12, 2015-06-12, 2015-06-12, 2015-…
Does anything look off about the giving data?
clean_names(giving)
## # A tibble: 540,000 x 6
## household_id id gift_id credit_type gift_amt gift_date
## <dbl> <dbl> <dbl> <chr> <dbl> <date>
## 1 1117484 1235370 1000080 Hard-Credit 10 2015-06-12
## 2 1117484 1449053 1000080 Soft-Credit 10 2015-06-12
## 3 8834030 8082210 1000127 Hard-Credit 10 2015-06-12
## 4 8834030 2008892 1000127 Soft-Credit 10 2015-06-12
## 5 5163380 7190414 1000144 Hard-Credit 10 2015-06-12
## 6 5163380 7400710 1000144 Soft-Credit 10 2015-06-12
## 7 7142206 9585779 1000179 Hard-Credit 10 2015-06-12
## 8 7142206 6264037 1000179 Soft-Credit 10 2015-06-12
## 9 7761622 5283324 1000212 Hard-Credit 10 2015-06-12
## 10 7761622 7017300 1000212 Soft-Credit 10 2015-06-12
## # … with 539,990 more rows
giving <- clean_names(giving)
R uses the NA code for missing values. You can test if a value is missing using the is.na() function.
How many missing values are there in the deceased variable?
is.na(bio$deceased)[1:100]
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE
sum(is.na(bio$deceased))
## [1] 2004
bio %>%
summarise(deceased_na = sum(is.na(deceased)))
## # A tibble: 1 x 1
## deceased_na
## <int>
## 1 2004
bio %>%
summarise_all(funs(sum(is.na(.))))
## # A tibble: 1 x 13
## id name household_id deceased country city birthday zip state lat
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 0 0 0 2004 0 0 9324 10000 10000 10000
## # … with 3 more variables: lon <int>, capacity <int>, capacity_source <int>
giving %>%
summarise_all(funs(sum(is.na(.))))
## # A tibble: 1 x 6
## household_id id gift_id credit_type gift_amt gift_date
## <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0
Which records are missing zip, state, lat, and lon?
bio %>%
filter(is.na(zip)) %>%
glimpse()
## Observations: 10,000
## Variables: 13
## $ id <dbl> 2606104, 5544635, 6207806, 5994054, 6179552, 3931747,…
## $ name <chr> "Gilman, Michael", "Liang, Nicole", "Charley, William…
## $ household_id <dbl> 1000591, 1001161, 1002266, 1002764, 1004042, 1004044,…
## $ deceased <chr> "N", "N", "N", "N", "N", NA, "N", "N", "N", "N", "N",…
## $ country <chr> "Brazil", "Nigeria", "China", "China", "China", "Chin…
## $ city <chr> "Rio de Janeiro", "Onitsha", "Shenzhen", "Shenzhen", …
## $ birthday <date> 1918-08-25, 1918-09-20, NA, 1918-11-25, 1918-12-17, …
## $ zip <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ state <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ lat <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ lon <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ capacity <chr> "$1k - $2.5k", "$75k - $100k", "$10k - $25k", "$2.5k …
## $ capacity_source <chr> "institutional", "screening", "institutional", "insti…
The zipcode package can be used to get lat/lon coordinates for each zipcode’s centroid in the US. This data is also available here.
You can treat character, numeric, and factor variables seperately using variations of the select function.
bio %>%
select_if(is.numeric)
## # A tibble: 100,000 x 4
## id household_id lat lon
## <dbl> <dbl> <dbl> <dbl>
## 1 4218829 1000042 33.2 -102.
## 2 2323958 1000214 36.1 -80.2
## 3 1006648 1000214 42.2 -83.3
## 4 8066824 1000294 29.5 -95.8
## 5 4049269 1000294 42.1 -88.0
## 6 4451170 1000309 35.7 -83.5
## 7 3981554 1000309 37.2 -94.4
## 8 3096132 1000334 37.0 -122.
## 9 8415767 1000334 33.6 -112.
## 10 5191784 1000374 39.5 -105.
## # … with 99,990 more rows
bio %>%
select_if(is.character)
## # A tibble: 100,000 x 8
## name deceased country city zip state capacity capacity_source
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 al-Bari, … N United S… Tahoka 79373 TX $10k - $… institutional
## 2 Fernandez… Y United S… Winston … 27105 NC >$1k screening
## 3 Brown, Ni… N United S… Taylor 48180 MI <NA> screening
## 4 Zhang, Ti… N United S… Rosenberg 77471 TX $500k - … institutional
## 5 Redford, … N United S… Arlingto… 60004 IL $25k - $… screening
## 6 Martinez,… N United S… Gatlinbu… 37738 TN $2.5k - … institutional
## 7 Reynolds,… N United S… Cartervi… 64835 MO $25k - $… screening
## 8 Cervantes… N United S… Santa cr… 95060 CA <NA> institutional
## 9 Puller, B… N United S… Scottsda… 85254 AZ $1k - $2… screening
## 10 Greene, M… Y United S… Conifer 80433 CO <NA> screening
## # … with 99,990 more rows
Which variable is should not be a character?
bio <-
bio %>%
mutate(zip = as.numeric(zip))
How might we recode the missing values for the deceased variable?
bio <-
bio %>%
mutate(deceased_missing = ifelse(is.na(deceased), "Y", "N"),
deceased = ifelse(is.na(deceased), "N", deceased))
Are certain capacity sources missing capacity information?
# capacity source
bio %>%
count(capacity, capacity_source)
## # A tibble: 50 x 3
## capacity capacity_source n
## <chr> <chr> <int>
## 1 >$1k institutional 2189
## 2 >$1k screening 3335
## 3 >$1k <NA> 198
## 4 $100k - $250k institutional 1835
## 5 $100k - $250k screening 2717
## 6 $100k - $250k <NA> 153
## 7 $10k - $25k institutional 5449
## 8 $10k - $25k screening 8172
## 9 $10k - $25k <NA> 462
## 10 $10M - $25M institutional 5
## # … with 40 more rows
bio %>%
filter(is.na(capacity_source)) %>%
count(capacity, capacity_source)
## # A tibble: 15 x 3
## capacity capacity_source n
## <chr> <chr> <int>
## 1 >$1k <NA> 198
## 2 $100k - $250k <NA> 153
## 3 $10k - $25k <NA> 462
## 4 $1k - $2.5k <NA> 202
## 5 $1M - $2.5M <NA> 6
## 6 $2.5k - $5k <NA> 322
## 7 $250k - $500k <NA> 162
## 8 $25k - $50k <NA> 543
## 9 $25M - $50M <NA> 1
## 10 $500k - $750k <NA> 163
## 11 $50k - $75K <NA> 305
## 12 $5k - $10k <NA> 289
## 13 $750k - $1M <NA> 93
## 14 $75k - $100k <NA> 303
## 15 <NA> <NA> 209
Let’s take a closer look at the birthday variable. What do you notice when we sort all birthdays in order?
# birthdays - let's sort all the birthdays in order
bio %>%
select(birthday, deceased) %>%
arrange(birthday)
## # A tibble: 100,000 x 2
## birthday deceased
## <date> <chr>
## 1 1900-01-01 N
## 2 1900-01-01 N
## 3 1900-01-01 N
## 4 1900-01-01 N
## 5 1900-01-01 N
## 6 1900-01-01 N
## 7 1900-01-01 N
## 8 1900-01-01 N
## 9 1900-01-01 N
## 10 1900-01-01 N
## # … with 99,990 more rows
# let's take a look at the distribution of birthdays
bio %>%
select(birthday) %>%
ggplot(aes(x = birthday)) +
geom_histogram()
# let's clean up what appears to be a missing value indicator
bio <-
bio %>%
mutate(birthday = if_else(birthday == as.Date("1/1/1900", "%m/%d/%Y"),
as.Date(NA),
birthday))
# let's take another look
bio %>%
select(birthday) %>%
ggplot(aes(x = birthday)) +
geom_histogram()
# bio table - character variables bar plots
bio %>%
select_if(is.character) %>%
select(-name, -city) %>%
gather("variable", "value") %>%
ggplot(aes(x = value)) +
geom_bar() +
facet_wrap(~variable, scales = "free", nrow = 2) +
theme(axis.text.y = element_text(size = 6)) +
coord_flip()
What looks strange?
# clean capacity ratings
sort(unique(bio$capacity))
## [1] ">$1k" "$100k - $250k" "$10k - $25k" "$10M - $25M"
## [5] "$1k - $2.5k" "$1M - $2.5M" "$2.5k - $5k" "$250k - $500k"
## [9] "$25k - $50k" "$25M - $50M" "$500k - $750k" "$50k - $75K"
## [13] "$5k - $10k" "$5M - $10M" "$750k - $1M" "$75k - $100k"
## [17] "2.5M - $5M"
# demo multiple cursors
# [1] ">$1k"
# [2] "$100k - $250k"
# [3] "$10k - $25k"
# [4] "$10M - $25M"
# [5] "$1k - $2.5k"
# [6] "$1M - $2.5M"
# [7] "$2.5k - $5k"
# [8] "$250k - $500k"
# [9] "$25k - $50k"
# [10] "$25M - $50M"
# [11] "$500k - $750k"
# [12] "$50k - $75K"
# [13] "$5k - $10k"
# [14] "$5M - $10M"
# [15] "$750k - $1M"
# [16] "$75k - $100k"
bio <-
bio %>%
mutate(capacity = factor(capacity, levels = c(">$1k",
"$1k - $2.5k",
"$2.5k - $5k",
"$5k - $10k",
"$10k - $25k",
"$25k - $50k",
"$50k - $75K",
"$75k - $100k",
"$100k - $250k",
"$250k - $500k",
"$500k - $750k",
"$750k - $1M",
"$1M - $2.5M",
"2.5M - $5M",
"$5M - $10M",
"$10M - $25M",
"$25M - $50M")))
# let's take another look at those capacities
bio %>%
select(capacity) %>%
ggplot(aes(x = capacity)) +
geom_bar() +
coord_flip()
# state
bio %>%
filter(!is.na(state)) %>%
count(state) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ggplot(aes(x = reorder(state, n), y = n)) +
geom_bar(stat = "identity", fill = "#027854") +
coord_flip() +
ggthemes::theme_tufte() +
labs(y = "Number of Prospects",
x = "Primary Residence State",
title = "Prospects by State")
Is this right? Do we need to exclude some prospects?
# state
state_plot <-
bio %>%
filter(!is.na(state),
deceased == "N",
!duplicated(household_id)) %>%
count(state) %>%
arrange(desc(n)) %>%
slice(1:10) %>%
ggplot(aes(x = reorder(state, n), y = n)) +
geom_bar(stat = "identity", fill = "#027854") +
coord_flip() +
ggthemes::theme_tufte() +
labs(y = "Number of Prospects",
x = "Primary Residence State",
title = "Prospects by State")
ggplotly(state_plot)
How would we plot the distribution of gift dates (i.e., the number of gifts per day)?
# gifts per day
giving %>%
filter(credit_type == "Hard-Credit") %>%
ggplot(aes(x = gift_date)) +
geom_histogram()
How about the distribution of gift amounts?
giving %>%
filter(credit_type == "Hard-Credit") %>%
ggplot(aes(x = gift_amt)) +
geom_histogram()
giving %>%
filter(credit_type == "Hard-Credit",
gift_amt < 1000000) %>%
ggplot(aes(x = gift_amt)) +
geom_histogram()
giving %>%
filter(credit_type == "Hard-Credit",
gift_amt < 100000) %>%
ggplot(aes(x = gift_amt)) +
geom_histogram()
What’s our first step?
giving <-
giving %>%
mutate(fy = ifelse(month(gift_date) >= 7,
year(gift_date) +1,
year(gift_date)))
giving %>%
count(fy)
## # A tibble: 6 x 2
## fy n
## <dbl> <int>
## 1 2015 5621
## 2 2016 109231
## 3 2017 108057
## 4 2018 107956
## 5 2019 107221
## 6 2020 101914
giving %>%
filter(credit_type == "Hard-Credit") %>%
group_by(fy) %>%
summarise(total_giving = dollar(sum(gift_amt)))
## # A tibble: 6 x 2
## fy total_giving
## <dbl> <chr>
## 1 2015 $6,472,584
## 2 2016 $121,959,923
## 3 2017 $130,648,642
## 4 2018 $124,596,615
## 5 2019 $136,147,889
## 6 2020 $2,463,722,544
Is this it? What else might we need to account for?
calculateFY <- function(date = Sys.Date(), date.format = "%Y-%m-%d", ytd = FALSE, fiscal.year = 2020){
date <- as.Date(date, date.format)
fy.date <-
ifelse(month(date) %in% c(1:6),
year(date),
year(date) + 1)
if(ytd == TRUE){
fy <- fiscal.year
end.this.fy <- as.Date(paste0("6/30/", fy), format = "%m/%d/%Y")
days.left.this.fy <- end.this.fy - Sys.Date()
end.date.fy <- as.Date(paste0("6/30/", fy.date), format = "%m/%d/%Y")
days.left.date.fy <- end.date.fy - date
if(days.left.date.fy >= days.left.this.fy){
return(fy.date)
}else{
return(NA)
}
}else{
return(fy.date)
}
}
# giving$fy <- unlist(lapply(giving$gift_date, function(x) suppressWarnings(calculateFY(x, ytd = TRUE))))
ytd_table <- tibble(
gift_date = seq(min(giving$gift_date), max(giving$gift_date), by = "day"),
)
ytd_table$fy_ytd <- unlist(lapply(ytd_table$gift_date, function(x) suppressWarnings(calculateFY(x, ytd = TRUE))))
giving %>%
left_join(ytd_table) %>%
filter(!is.na(fy_ytd)) %>%
filter(credit_type == "Hard-Credit") %>%
group_by(fy_ytd) %>%
summarise(total_giving = dollar(sum(gift_amt)))
## # A tibble: 5 x 2
## fy_ytd total_giving
## <dbl> <chr>
## 1 2016 $116,271,647
## 2 2017 $123,927,923
## 3 2018 $118,869,402
## 4 2019 $127,521,450
## 5 2020 $2,463,722,544
There is a fundraising R package in development that may help and is available here.
What might our first step be?
# calculate annual and total giving
# see who is not rated or rated low
giving_by_household_and_fy <-
giving %>%
group_by(household_id, fy) %>%
summarise(giving = sum(gift_amt)) %>%
spread(fy, giving, sep = "") %>%
ungroup() %>%
mutate(total_giving = rowSums(select(., contains("fy")), na.rm = TRUE))
sum(duplicated(giving_by_household_and_fy$household_id))
## [1] 0
bio_with_household_giving <-
bio %>%
filter(!duplicated(household_id)) %>%
left_join(giving_by_household_and_fy)
bio_with_household_giving %>%
filter(capacity_source %in% c(NA, "screening")) %>%
filter(total_giving > 10000) %>%
filter(!is.na(fy2019)) %>%
arrange(desc(total_giving)) %>%
select(name, capacity, capacity_source, contains("fy"), total_giving) %>%
datatable(rownames = FALSE) %>%
formatCurrency(columns = c(3:10), digits = 0)
What might our first step be?
bio_with_household_giving %>%
filter(total_giving > 10000) %>%
filter(!is.na(fy2019)) %>%
leaflet() %>%
addTiles() %>%
addCircleMarkers(clusterOptions = markerClusterOptions(),
label = ~paste0(name, ": ", scales::dollar(total_giving)))